# -*- coding: utf-8 -*-
import re
import pandas as pd
import jieba.analyse
from pyecharts import options as opts
from pyecharts.charts import WordCloud
# Load the raw Weibo comment dataset.
datas = pd.read_csv('/data/dataset/weibo/6a.csv', encoding='utf-8')
print(datas.head(5))

# Register the Chinese stop-word list so jieba's extract_tags skips filler words.
jieba.analyse.set_stop_words('/data/dataset/weibo/chineseStopWords.txt')

# Strip digits, ASCII punctuation, common Chinese punctuation and whitespace
# before keyword extraction. Compiled once, outside the loop.
# NOTE: the original pattern was a broken string literal (an unescaped ' made
# the following # start a comment inside the open re.sub() call, so the file
# failed to parse); this raw string is the properly-escaped equivalent.
_PUNCT_RE = re.compile(
    r"[0-9!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~\s"
    r"，。～?★、…【】《》？“”‘’！]+"
)

# (keyword, scaled_weight) pairs collected across all comments,
# consumed below by the word-cloud chart.
words = []
# dropna(): the CSV may contain empty cells, which arrive as NaN floats
# and would crash the regex substitution.
for comment in datas['评论内容'].dropna():
    cleaned = _PUNCT_RE.sub('', comment)
    # Top-5 TF-IDF-weighted keywords per comment; scale the weights up so
    # the word-cloud size range is meaningful.
    for keyword, weight in jieba.analyse.extract_tags(cleaned, topK=5, withWeight=True):
        words.append((keyword, weight * 10000))
# Chart construction
def wordcloud_base() -> WordCloud:
    """Build a diamond-shaped word cloud from the module-level ``words`` pairs."""
    chart = WordCloud()
    # pyecharts chart methods return the chart itself, so sequential calls
    # are equivalent to the fluent/chained form.
    chart.add('', words, word_size_range=[20, 100], shape='diamond')
    chart.set_global_opts(title_opts=opts.TitleOpts(title='WordCloud词云'))
    return chart

# Render the chart to a standalone HTML file.
wordcloud_base().render('词云图.html')